

/*
* Make regression data

i.e. combine all safegraph data and also add covariates and ranking of zips
*/

* Locals naming the variables on which the zip ranking is based.
*   rank_var       : ranking indicator (renamed to high_exp after the ranking merge)
*   contr_rank_var : additional ranking variables kept from the same ranking file
local rank_var abv_md_cty_pc_c_11
local contr_rank_var ///
	vent_cty_pc_c_11_pc vent_cty_pc_c_13_pc vent_cty_pc_c_17_pc ///
	perc_nat_pc_c_11_pc perc_nat_pc_c_13_pc perc_nat_pc_c_17_pc

* POI data: weekly visits by zip and industry
use "${data_derived_poi}/zip_poi_visits_by_ind_weekly.dta", clear

* Week 31 is incomplete (the last day in the sample is Aug 2, but Stata's week 31
* runs until Aug 4), so only weeks before 31 are kept.
keep if week < 31

* Give shorter names to the NAICS categories.
* The two lists are matched by position: the k-th long name becomes the k-th short name.
local long_names  entertainment food_places food_stores health_care other_retail_trade .
local short_names entertain fd_plces fd_stores health retail miss
local n_names : word count `long_names'
forvalues k = 1/`n_names' {
	local from : word `k' of `long_names'
	local to   : word `k' of `short_names'
	replace naics_short = "`to'" if naics_short == "`from'"
}

* reshape wide on industry codes: one num_visits_<industry> and one
* number_poi_<industry> column per value of naics_short
drop impute_zero exist_at_baseline
rename (num_visits number_poi) (num_visits_ number_poi_)
* The stubs are written with the trailing underscore they were just renamed to.
* Naming them without it only resolves through variable-name abbreviation,
* while the code below relies on names of the form num_visits_11.
greshape wide num_visits_ number_poi_, i(zcta week) j(naics_short) string

* create total over all industries
* The rowtotal ranges below are positional: they sum every column located
* between <stub>_11 and <stub>_92 in the current variable order.
order num_visits*, last
foreach var in num_visits number_poi {
	gegen `var'_total = rowtotal(`var'_11-`var'_92)
	* add the missing-industry category only where it is observed, so a
	* missing value there does not wipe out the total
	replace `var'_total = `var'_total + `var'_miss if `var'_miss !=.
}

* Replace raw visit counts by log(1 + visits) and prefix the names with l_.
* The wildcard is expanded once when the loop starts, so it covers the
* per-industry columns as well as num_visits_total.
foreach v of varlist num_visits_* {
	quietly replace `v' = ln(`v' + 1)
	rename `v' l_`v'
}

* Delete the individual industry columns located between the _11 and _92
* variables (positional ranges), for both the POI counts and the logged visits.
drop number_poi_11-number_poi_92 l_num_visits_11-l_num_visits_92

* merge with social distancing data
* assert(1 2 3) accepts every match status and no keep() is given, so
* unmatched rows from either file stay in the data.
* NOTE(review): the week < 31 filter was applied to the POI data only; if the
* social distancing file contains later weeks they enter here as unmatched
* rows -- confirm this is intended.
fmerge 1:1 zcta week using "${data_derived_sd}/zip_soc_dist_weekly.dta", ///
	assert(1 2 3) nogen
gen pct_devices_home = completely_home_device_count / device_count
gen l_avg_dist = log(avg_distance_traveled)

* calculate percentage change in distance traveled relative to january
* step 1: per-zip mean over weeks 1-4, filled in on those rows only
gegen mean_avg_dist_traveled_jan_h = mean(avg_distance_traveled) if inrange(week, 1, 4), by(zcta)
* step 2: spread that value to every week of the zip. The helper is referenced
* by its full name; the shorter name only resolved through variable
* abbreviation and fails when varabbrev is switched off.
gegen mean_avg_dist_traveled_jan = mean(mean_avg_dist_traveled_jan_h), by(zcta)
drop mean_avg_dist_traveled_jan_h
gen pct_chg_dist_trvld = ///
	(avg_distance_traveled-mean_avg_dist_traveled_jan)/mean_avg_dist_traveled_jan*100

* tag every observation as year 2020 (no transaction-data merge is performed
* here; year is used by the week filter right before the final save)
gen year = 2020

* now add covariates
* The covariate file is prepared inside preserve/restore and stored in a
* tempfile, so the data currently in memory come back unchanged.
preserve
	use "${data_derived_covs}/zip_covariates.dta", clear

	* variables that are converted into fractions of pop
	local share_vars men white black asian ///
		male_mngmt_bus_sci_arts male_service_occ male_sales_office ///
		male_natres_cons_maint male_prod_trans ///
		female_mngmt_bus_sci_arts female_service_occ female_sales_office ///
		female_natres_cons_maint female_prod_trans ///
		men_* women_* hs_ged some_coll coll

	foreach v of varlist `share_vars' {
		* force turns non-numeric entries into missing values;
		* rows with a missing value are then removed before dividing
		destring `v', replace force
		keep if !missing(`v')
		replace `v' = `v' / pop
	}

	* group some of these variables: male + female shares by occupation
	foreach occ in service_occ prod_trans mngmt_bus_sci_arts {
		gen total_`occ' = male_`occ' + female_`occ'
		drop male_`occ' female_`occ'
	}

	* men + women shares by age bracket
	foreach bracket in below_18 18_24 25_34 35_44 45_54 55_64 65_74 75_and_above {
		gen total_age_`bracket' = men_`bracket' + women_`bracket'
		drop men_`bracket' women_`bracket'
	}

	tempfile covs
	save `covs'
restore

* Attach the covariates; keep(3) retains matched observations only, so
* observations for which we don't have covariates (those are strange) are dropped.
fmerge m:1 zcta using `covs', keep(3) assert(1 2 3) nogen

* Add indicators for above/below median SCI weighted exposure.
* This also adds county fips codes. Again only matched zips are kept.
fmerge m:1 zcta using "${data_derived_exposure}/sci_weighted_cases_rank", ///
	keepusing(`rank_var' `contr_rank_var' fips ///
	vent_nat*pc*inc* vent_nat*pc*urban* vent_nat*pc*dens*) ///
	keep(3) assert(1 2 3) nogen

* the ranking indicator is called high_exp from here on
rename `rank_var' high_exp
	
* Drop observations from 2019 for which no corresponding week in 2020 is observed:
* find the last week observed in 2020 and remove anything beyond it.
* NOTE(review): year is set to 2020 on every row earlier in this file, so this
* filter looks like a no-op unless 2019 rows are appended -- confirm.
quietly summarize week if year == 2020, meanonly
drop if week > `r(max)'

* save the regression data
save "${data_derived}/zip_regression_data.dta", replace
	
